## Warning: package 'e1071' was built under R version 4.0.5
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Warning: package 'plotly' was built under R version 4.0.4
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.0.4
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.0.5
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Load the per-member vote counts (aye / nay / other) together with each
# member's last name and party label.
# NOTE(review): the path points into one user's home directory - confirm it
# exists on the machine running this report.
house_votes_Rep <- read_csv(file = "~/Fall21/introDS/DS-3001-New/data/house_votes_Rep.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Last.Name = col_character(),
## party.labels = col_character(),
## aye = col_double(),
## nay = col_double(),
## other = col_double()
## )
# Count how many rows carry each party label.
table(house_votes_Rep[["party.labels"]])
##
## Democrat Republican
## 198 229
#View(house_votes_Rep)
Goal: Know how to make decisions and answer questions using clustering.
Repeat the clustering process only using the Rep house votes dataset - What differences and similarities did you see between how the clustering worked for the datasets?
The main difference is that, in the Republican house votes dataset, the Republican cluster is the one with more aye votes and fewer nay votes, whereas in the Democratic house votes dataset the Democratic cluster is the one with more aye votes and fewer nay votes.
#Select the variables to be included in the cluster
clust_data_Rep <- house_votes_Rep[, c("aye", "nay", "other")]
#Run the clustering algo with 2 centers
# Seed the RNG first so the randomly drawn starting centers are reproducible.
set.seed(1)
# iter.max = 30 mirrors the setting used inside explained_variance(), so this
# fit and the k = 2 entry of the elbow chart are produced under the same
# conditions; the default cap of 10 iterations can stop Lloyd's algorithm
# before it has converged.
kmeans_obj_Rep <- kmeans(clust_data_Rep, centers = 2,
                         algorithm = "Lloyd", iter.max = 30)
#View the results
kmeans_obj_Rep
## K-means clustering with 2 clusters of sizes 225, 202
##
## Cluster means:
## aye nay other
## 1 122.56889 106.9956 90.43556
## 2 70.32673 145.6337 104.03960
##
## Clustering vector:
## [1] 2 2 2 2 1 1 1 2 1 2 1 1 1 1 2 1 2 2 1 1 1 1 2 2 1 2 1 2 1 2 2 2 2 1 1 2 1
## [38] 1 2 1 2 2 2 2 2 1 2 2 1 2 2 2 2 2 1 1 2 1 2 2 2 2 2 1 1 1 1 2 2 1 2 2 2 2
## [75] 2 1 1 2 1 1 1 2 1 1 1 1 2 2 2 2 1 1 1 2 2 2 1 2 2 2 2 1 1 2 1 2 1 1 1 1 1
## [112] 1 2 2 2 1 1 2 1 1 2 2 1 1 1 1 1 1 2 2 2 2 2 1 2 1 2 1 1 2 1 2 2 2 2 2 1 1
## [149] 2 2 1 1 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 1 1 2 2 1 2 2 2 2 1 1 1 1 2 1 2 2
## [186] 1 1 1 1 2 1 2 1 2 2 2 1 1 2 2 1 2 2 2 1 2 1 1 1 2 1 1 1 1 2 2 1 1 2 1 2 1
## [223] 2 2 2 1 1 1 1 1 1 2 2 1 1 1 2 2 1 2 2 2 2 1 1 1 1 1 2 2 2 2 1 1 2 2 2 1 1
## [260] 1 2 1 2 1 2 1 2 1 2 1 1 1 1 2 1 1 1 2 2 2 2 2 2 2 1 1 2 2 1 1 1 1 1 2 2 2
## [297] 2 1 2 1 1 1 1 2 1 2 2 1 2 2 1 1 1 2 2 1 2 1 1 2 1 1 2 2 1 1 2 2 1 1 1 2 1
## [334] 2 1 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 2 1 1 1 1 2 1 2 1 1 2 2 2 2
## [371] 1 1 1 2 1 2 2 2 1 2 1 1 2 1 1 2 1 2 1 1 1 1 1 2 1 2 2 1 2 2 1 1 1 1 1 2 1
## [408] 1 2 2 1 2 1 1 1 1 1 2 1 2 1 2 1 2 2 1 2
##
## Within cluster sum of squares by cluster:
## [1] 43093.49 77671.01
## (between_SS / total_SS = 79.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#Visualize the output
# Cluster assignments as a factor so they can drive a discrete shape scale.
party_clusters_Rep <- as.factor(kmeans_obj_Rep$cluster)
# The plot is kept in a variable so that ggsave() below writes exactly this
# figure instead of depending on whichever plot was displayed last.
votes_plot_Rep <- ggplot(house_votes_Rep,
                         aes(x = aye,
                             y = nay,
                             color = party.labels,          #<- color points by party label
                             shape = party_clusters_Rep)) + #<- draw points by cluster
  geom_point(size = 6) +
  ggtitle("Aye vs. Nay votes for Republican-introduced bills") +
  xlab("Number of Aye Votes") +
  ylab("Number of Nay Votes") +
  # The shape values "1" and "2" draw each point as its cluster number.
  scale_shape_manual(name = "Cluster",
                     labels = c("Cluster 1", "Cluster 2"),
                     values = c("1", "2")) +
  # Legend labels and colors for the two party labels
  # (legend spelling corrected from "Deomcratic").
  scale_color_manual(name = "Party",
                     labels = c("Democratic", "Republican"),
                     values = c("blue", "red")) +
  theme_light()
# Display the figure.
votes_plot_Rep
#save as a png
ggsave("US House Votes for Rep Bills.png",
       plot = votes_plot_Rep,
       width = 10,
       height = 5.62,
       units = "in")
#Evaluate the quality of the clustering
# Between-cluster sum of squares, taken from the "betweenss" component
# of the kmeans result.
num_Rep <- kmeans_obj_Rep[["betweenss"]]
# Total sum of squares, taken from the "totss" component of the kmeans result.
denom_Rep <- kmeans_obj_Rep[["totss"]]
# Share of the total sum of squares accounted for by the clusters
# (the between_SS / total_SS figure that kmeans prints).
var_exp_Rep <- num_Rep / denom_Rep
var_exp_Rep
## [1] 0.7952692
#Use the function we created to evaluate several different number of clusters
# explained_variance() fits k-means (Lloyd's algorithm, at most 30 iterations)
# with k centers on data_in and returns betweenss / totss for that fit.
#   data_in: the data handed straight to kmeans()
#   k:       the number of centers
# The RNG is re-seeded with 1 on every call, so repeated calls give the same
# answer; note that this also resets the caller's RNG state.
explained_variance <- function(data_in, k) {
  set.seed(1)
  fit <- kmeans(data_in, centers = k, algorithm = "Lloyd", iter.max = 30)
  # var_exp = between-cluster sum of squares / total sum of squares
  fit[["betweenss"]] / fit[["totss"]]
}
# Explained-variance ratio for k = 1..10. vapply() is used instead of sapply()
# so the result is guaranteed to be a numeric vector with one value per k
# (it errors instead of silently changing shape if a call returns anything else).
explained_var_Rep <- vapply(1:10, explained_variance, numeric(1),
                            data_in = clust_data_Rep)
#View(explained_var_Rep)
#Create a elbow chart of the output
# One row per candidate k, paired with its explained-variance ratio.
elbow_data_Rep <- data.frame(k = 1:10, explained_var_Rep = explained_var_Rep)
#View(elbow_data_Rep)
# Points plus a connecting line: ratio on the y axis against k on the x axis.
ggplot(data = elbow_data_Rep,
       mapping = aes(x = k, y = explained_var_Rep)) +
  geom_point(size = 4) +  # point size
  geom_line(size = 1) +   # line thickness
  labs(x = 'k',
       y = 'Inter-cluster Variance / Total Variance') +
  theme_light()
#Use NbClust to select a number of clusters
# Run NbClust on the three vote-count columns with k-means as the clustering
# method, keep the result, then print it.
nbclust_obj_Rep <- NbClust(data = clust_data_Rep, method = "kmeans")
nbclust_obj_Rep
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## Warning in log(det(P)/det(W)): NaNs produced
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 12 proposed 2 as the best number of clusters
## * 4 proposed 3 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 2 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 131.0817 1650.8972 140.8739 76.7950 NaN -534.8540 4148081409 120764.50
## 3 0.8084 1166.7437 100.0769 73.2048 NaN -299.2478 2092420152 90700.26
## 4 0.7187 992.4333 54.3510 68.7774 16304.01 253.9222 1364506819 73380.28
## 5 0.3692 851.5327 92.8867 65.0656 NaN -4.0054 1089970216 65025.23
## 6 0.6653 847.7351 109.7583 64.6344 16862.81 154.3630 711907875 53294.53
## 7 11.4461 906.7565 44.6891 65.8317 17305.83 74.4479 448521362 42273.48
## 8 0.5271 864.2396 44.9357 64.7719 NaN -36.2018 365507694 38208.04
## 9 2.2064 840.9147 25.6650 64.1975 17740.77 44.4388 300834537 34507.30
## 10 0.4156 794.3214 38.9806 63.0588 NaN -3.9473 274181075 32511.13
## 11 0.7418 783.7302 40.9588 62.8420 NaN -65.7978 223035146 29731.84
## 12 0.4787 784.4653 58.9331 62.9414 17615.49 105.9411 184024352 27066.87
## 13 1.2365 824.1301 52.1301 64.0516 18438.62 18.0883 142963370 23701.14
## 14 2.4682 858.4573 32.3669 65.0020 NaN -3.3429 111468031 21050.50
## 15 0.4565 859.8358 48.2537 65.1549 18991.72 6.5938 96034586 19520.66
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale
## 2 -1.196478e+14 127.4114 0.1605 0.5038 0.7149 1.1814 -31.0231 -0.2603
## 3 -3.098874e+14 169.6442 0.2177 0.7555 0.6854 4.2585 -152.2704 -1.2096
## 4 4.270813e+14 209.6854 0.1694 1.1057 0.4545 0.9574 7.2442 0.0717
## 5 -3.264814e+16 236.6277 0.1605 1.0231 0.4588 1.7872 -52.8556 -0.7360
## 6 8.442332e+14 288.7120 0.1330 1.1476 0.3324 1.7658 -32.9601 -0.7271
## 7 1.498087e+15 363.9817 0.1906 1.0181 0.3467 2.8985 -89.0786 -0.9757
## 8 -3.296079e+15 402.7103 0.1807 1.0080 0.3510 0.9977 0.2089 0.0039
## 9 2.747395e+15 445.8991 0.1743 0.9032 0.3627 0.5647 19.2747 1.2963
## 10 -3.297302e+16 473.2770 0.1938 1.0032 0.3274 6.8536 -116.1566 -1.3908
## 11 -2.061187e+15 517.5183 0.1906 0.9419 0.3232 0.7118 15.7916 0.6801
## 12 1.268548e+15 568.4725 0.1883 0.9347 0.3356 2.3591 -33.4148 -0.9416
## 13 6.598389e+15 649.1998 0.1706 0.9088 0.3436 2.0965 -11.5065 -0.8533
## 14 -3.306221e+16 730.9457 0.1539 0.8893 0.3603 6.8406 -61.4746 -1.3566
## 15 1.651659e+16 788.2302 0.2181 0.9026 0.3542 0.7263 24.1153 0.6289
## Ratkowsky Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex
## 2 0.5589 60382.252 0.8806 0.5216 0.2779 0.1005 0 0.0763 13.4946
## 3 0.5020 30233.420 0.8967 5.6954 0.2853 0.0782 0 0.0934 12.5274
## 4 0.4533 18345.069 0.6993 2.3087 0.5622 0.0156 0 0.1517 11.0551
## 5 0.4101 13005.045 0.6586 1.5951 0.6556 0.0156 0 0.1620 10.2381
## 6 0.3805 8882.422 0.5582 1.0498 0.9561 0.0156 0 0.1535 9.2055
## 7 0.3580 6039.068 0.5250 1.4422 1.0747 0.0245 0 0.1471 8.4672
## 8 0.3360 4776.005 0.4959 0.8232 1.2010 0.0245 0 0.1573 7.8994
## 9 0.3192 3834.144 0.4855 2.2024 1.2426 0.0245 0 0.1627 7.5574
## 10 0.3019 3251.113 0.4598 2.5479 1.3859 0.0280 0 0.1973 7.2851
## 11 0.2896 2702.895 0.4418 3.0898 1.5030 0.0280 0 0.2265 7.0407
## 12 0.2791 2255.573 0.4241 0.3815 1.6339 0.0280 0 0.2725 6.6572
## 13 0.2688 1823.164 0.4117 0.1275 1.6649 0.0280 0 0.2661 6.3009
## 14 0.2603 1503.607 0.4107 -7.8443 1.5852 0.0280 0 0.2114 5.9222
## 15 0.2516 1301.377 0.3854 0.4523 1.8242 0.0394 0 0.2505 5.7283
## SDbw
## 2 0.2255
## 3 0.2677
## 4 0.4023
## 5 0.3697
## 6 0.4053
## 7 0.3781
## 8 0.3099
## 9 0.3628
## 10 0.2101
## 11 0.2945
## 12 0.2095
## 13 0.2059
## 14 0.1430
## 15 0.1779
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.6390 114.1248 1.0000
## 3 0.2115 742.0166 1.0000
## 4 0.2887 401.6280 0.9749
## 5 0.4868 126.4976 1.0000
## 6 0.5151 71.5437 1.0000
## 7 0.0438 2971.3276 1.0000
## 8 0.5066 86.6886 0.9997
## 9 0.5413 21.1850 0.2763
## 10 0.3322 273.4259 1.0000
## 11 0.5318 34.3418 0.5650
## 12 0.3500 107.6921 1.0000
## 13 0.3414 42.4447 1.0000
## 14 0.2298 241.3514 1.0000
## 15 0.4783 69.8184 0.5974
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 2.0000 2.000 7.0000 2.000 13.0000 5.0000 3
## Value_Index 131.0817 1650.897 65.0692 76.795 823.1304 416.2959 2055661257
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.00 1.50000e+01 7.0000 6.000 2.0000 2.0000 2.0000
## Value_Index 12744.26 4.95788e+16 -36.5411 0.133 0.5038 0.7149 1.1814
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.0000 2.0000 2.0000 3.00 3.0000 1 2.0000
## Value_Index -31.0231 -0.2603 0.5589 30148.83 0.8967 NA 0.2779
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 2.0000 0 2.0000 0 14.000
## Value_Index 0.1005 0 0.0763 0 0.143
##
## $Best.partition
## [1] 2 2 2 2 1 1 1 2 1 2 1 1 1 1 2 1 2 2 1 1 1 1 2 2 1 2 1 2 1 2 2 2 2 1 1 2 1
## [38] 1 2 1 2 2 2 2 2 1 2 2 1 2 2 2 2 2 1 1 2 1 2 2 2 2 2 1 1 1 1 2 2 1 2 2 2 2
## [75] 2 1 1 2 1 1 1 2 1 1 1 1 2 2 2 2 1 1 1 2 2 2 1 2 2 2 2 1 1 2 1 2 1 1 1 1 1
## [112] 1 2 2 2 1 1 2 1 1 2 2 1 1 1 1 1 1 2 2 2 2 2 1 2 1 2 1 1 2 1 2 2 2 2 2 1 1
## [149] 2 2 1 1 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 1 1 2 2 1 2 2 2 2 1 1 1 1 2 1 2 2
## [186] 1 1 1 1 2 1 2 1 2 2 2 1 1 2 2 1 2 2 2 1 2 1 1 1 2 1 1 1 1 2 2 1 1 2 1 2 1
## [223] 2 2 2 1 1 1 1 1 1 2 2 1 1 1 2 2 1 2 2 2 2 1 1 1 1 1 2 2 2 2 1 1 2 2 2 1 1
## [260] 1 2 1 2 1 2 1 2 1 2 1 1 1 1 2 1 1 1 2 2 2 2 2 2 2 1 1 2 2 1 1 1 1 1 2 2 2
## [297] 2 1 2 1 1 1 1 2 1 2 2 1 2 2 1 1 1 2 2 1 2 1 1 2 1 1 2 2 1 1 2 2 1 1 1 2 1
## [334] 2 1 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 2 1 1 1 1 2 1 2 1 1 2 2 2 2
## [371] 1 1 1 2 1 2 2 2 1 2 1 1 2 1 1 2 1 2 1 1 1 1 1 2 1 2 2 1 2 2 1 1 1 1 1 2 1
## [408] 1 2 2 1 2 1 1 1 1 1 2 1 2 1 2 1 2 2 1 2
# Print the stored NbClust result (index tables, critical values, best number
# of clusters per index, and the best partition).
print(nbclust_obj_Rep)
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 131.0817 1650.8972 140.8739 76.7950 NaN -534.8540 4148081409 120764.50
## 3 0.8084 1166.7437 100.0769 73.2048 NaN -299.2478 2092420152 90700.26
## 4 0.7187 992.4333 54.3510 68.7774 16304.01 253.9222 1364506819 73380.28
## 5 0.3692 851.5327 92.8867 65.0656 NaN -4.0054 1089970216 65025.23
## 6 0.6653 847.7351 109.7583 64.6344 16862.81 154.3630 711907875 53294.53
## 7 11.4461 906.7565 44.6891 65.8317 17305.83 74.4479 448521362 42273.48
## 8 0.5271 864.2396 44.9357 64.7719 NaN -36.2018 365507694 38208.04
## 9 2.2064 840.9147 25.6650 64.1975 17740.77 44.4388 300834537 34507.30
## 10 0.4156 794.3214 38.9806 63.0588 NaN -3.9473 274181075 32511.13
## 11 0.7418 783.7302 40.9588 62.8420 NaN -65.7978 223035146 29731.84
## 12 0.4787 784.4653 58.9331 62.9414 17615.49 105.9411 184024352 27066.87
## 13 1.2365 824.1301 52.1301 64.0516 18438.62 18.0883 142963370 23701.14
## 14 2.4682 858.4573 32.3669 65.0020 NaN -3.3429 111468031 21050.50
## 15 0.4565 859.8358 48.2537 65.1549 18991.72 6.5938 96034586 19520.66
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale
## 2 -1.196478e+14 127.4114 0.1605 0.5038 0.7149 1.1814 -31.0231 -0.2603
## 3 -3.098874e+14 169.6442 0.2177 0.7555 0.6854 4.2585 -152.2704 -1.2096
## 4 4.270813e+14 209.6854 0.1694 1.1057 0.4545 0.9574 7.2442 0.0717
## 5 -3.264814e+16 236.6277 0.1605 1.0231 0.4588 1.7872 -52.8556 -0.7360
## 6 8.442332e+14 288.7120 0.1330 1.1476 0.3324 1.7658 -32.9601 -0.7271
## 7 1.498087e+15 363.9817 0.1906 1.0181 0.3467 2.8985 -89.0786 -0.9757
## 8 -3.296079e+15 402.7103 0.1807 1.0080 0.3510 0.9977 0.2089 0.0039
## 9 2.747395e+15 445.8991 0.1743 0.9032 0.3627 0.5647 19.2747 1.2963
## 10 -3.297302e+16 473.2770 0.1938 1.0032 0.3274 6.8536 -116.1566 -1.3908
## 11 -2.061187e+15 517.5183 0.1906 0.9419 0.3232 0.7118 15.7916 0.6801
## 12 1.268548e+15 568.4725 0.1883 0.9347 0.3356 2.3591 -33.4148 -0.9416
## 13 6.598389e+15 649.1998 0.1706 0.9088 0.3436 2.0965 -11.5065 -0.8533
## 14 -3.306221e+16 730.9457 0.1539 0.8893 0.3603 6.8406 -61.4746 -1.3566
## 15 1.651659e+16 788.2302 0.2181 0.9026 0.3542 0.7263 24.1153 0.6289
## Ratkowsky Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex
## 2 0.5589 60382.252 0.8806 0.5216 0.2779 0.1005 0 0.0763 13.4946
## 3 0.5020 30233.420 0.8967 5.6954 0.2853 0.0782 0 0.0934 12.5274
## 4 0.4533 18345.069 0.6993 2.3087 0.5622 0.0156 0 0.1517 11.0551
## 5 0.4101 13005.045 0.6586 1.5951 0.6556 0.0156 0 0.1620 10.2381
## 6 0.3805 8882.422 0.5582 1.0498 0.9561 0.0156 0 0.1535 9.2055
## 7 0.3580 6039.068 0.5250 1.4422 1.0747 0.0245 0 0.1471 8.4672
## 8 0.3360 4776.005 0.4959 0.8232 1.2010 0.0245 0 0.1573 7.8994
## 9 0.3192 3834.144 0.4855 2.2024 1.2426 0.0245 0 0.1627 7.5574
## 10 0.3019 3251.113 0.4598 2.5479 1.3859 0.0280 0 0.1973 7.2851
## 11 0.2896 2702.895 0.4418 3.0898 1.5030 0.0280 0 0.2265 7.0407
## 12 0.2791 2255.573 0.4241 0.3815 1.6339 0.0280 0 0.2725 6.6572
## 13 0.2688 1823.164 0.4117 0.1275 1.6649 0.0280 0 0.2661 6.3009
## 14 0.2603 1503.607 0.4107 -7.8443 1.5852 0.0280 0 0.2114 5.9222
## 15 0.2516 1301.377 0.3854 0.4523 1.8242 0.0394 0 0.2505 5.7283
## SDbw
## 2 0.2255
## 3 0.2677
## 4 0.4023
## 5 0.3697
## 6 0.4053
## 7 0.3781
## 8 0.3099
## 9 0.3628
## 10 0.2101
## 11 0.2945
## 12 0.2095
## 13 0.2059
## 14 0.1430
## 15 0.1779
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.6390 114.1248 1.0000
## 3 0.2115 742.0166 1.0000
## 4 0.2887 401.6280 0.9749
## 5 0.4868 126.4976 1.0000
## 6 0.5151 71.5437 1.0000
## 7 0.0438 2971.3276 1.0000
## 8 0.5066 86.6886 0.9997
## 9 0.5413 21.1850 0.2763
## 10 0.3322 273.4259 1.0000
## 11 0.5318 34.3418 0.5650
## 12 0.3500 107.6921 1.0000
## 13 0.3414 42.4447 1.0000
## 14 0.2298 241.3514 1.0000
## 15 0.4783 69.8184 0.5974
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 2.0000 2.000 7.0000 2.000 13.0000 5.0000 3
## Value_Index 131.0817 1650.897 65.0692 76.795 823.1304 416.2959 2055661257
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.00 1.50000e+01 7.0000 6.000 2.0000 2.0000 2.0000
## Value_Index 12744.26 4.95788e+16 -36.5411 0.133 0.5038 0.7149 1.1814
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.0000 2.0000 2.0000 3.00 3.0000 1 2.0000
## Value_Index -31.0231 -0.2603 0.5589 30148.83 0.8967 NA 0.2779
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 2.0000 0 2.0000 0 14.000
## Value_Index 0.1005 0 0.0763 0 0.143
##
## $Best.partition
## [1] 2 2 2 2 1 1 1 2 1 2 1 1 1 1 2 1 2 2 1 1 1 1 2 2 1 2 1 2 1 2 2 2 2 1 1 2 1
## [38] 1 2 1 2 2 2 2 2 1 2 2 1 2 2 2 2 2 1 1 2 1 2 2 2 2 2 1 1 1 1 2 2 1 2 2 2 2
## [75] 2 1 1 2 1 1 1 2 1 1 1 1 2 2 2 2 1 1 1 2 2 2 1 2 2 2 2 1 1 2 1 2 1 1 1 1 1
## [112] 1 2 2 2 1 1 2 1 1 2 2 1 1 1 1 1 1 2 2 2 2 2 1 2 1 2 1 1 2 1 2 2 2 2 2 1 1
## [149] 2 2 1 1 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 1 1 2 2 1 2 2 2 2 1 1 1 1 2 1 2 2
## [186] 1 1 1 1 2 1 2 1 2 2 2 1 1 2 2 1 2 2 2 1 2 1 1 1 2 1 1 1 1 2 2 1 1 2 1 2 1
## [223] 2 2 2 1 1 1 1 1 1 2 2 1 1 1 2 2 1 2 2 2 2 1 1 1 1 1 2 2 2 2 1 1 2 2 2 1 1
## [260] 1 2 1 2 1 2 1 2 1 2 1 1 1 1 2 1 1 1 2 2 2 2 2 2 2 1 1 2 2 1 1 1 1 1 2 2 2
## [297] 2 1 2 1 1 1 1 2 1 2 2 1 2 2 1 1 1 2 2 1 2 1 1 2 1 1 2 2 1 1 2 2 1 1 1 2 1
## [334] 2 1 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 2 1 1 1 1 2 1 2 1 1 2 2 2 2
## [371] 1 1 1 2 1 2 2 2 1 2 1 1 2 1 1 2 1 2 1 1 1 1 1 2 1 2 2 1 2 2 1 1 1 1 1 2 1
## [408] 1 2 2 1 2 1 1 1 1 1 2 1 2 1 2 1 2 2 1 2
# View the output that shows the number of clusters each method recommends.
#View(nbclust_obj_Rep$Best.nc)
#Display the results visually
# Take the first row of Best.nc (the number of clusters each index proposes)
# and wrap it in a one-column data frame; the column is also named freq_k_Rep.
freq_k_Rep <- data.frame(freq_k_Rep = nbclust_obj_Rep$Best.nc[1, ])
#View(freq_k_Rep)
# Check the maximum number of clusters suggested.
max(freq_k_Rep[["freq_k_Rep"]])
## [1] 15
#essentially resets the plot viewer back to default
#dev.off()
# Plot as a bar chart: one bar per proposed number of clusters, with the bar
# height giving how many indices proposed it.
# The axis breaks are computed from the data rather than typed in by hand, so
# the tick marks stay correct if NbClust is rerun with other data or settings.
max_k_Rep <- max(freq_k_Rep$freq_k_Rep, na.rm = TRUE)
max_votes_Rep <- max(table(freq_k_Rep$freq_k_Rep))
ggplot(freq_k_Rep,
       aes(x = freq_k_Rep)) +
  geom_bar() +
  scale_x_continuous(breaks = seq(0, max_k_Rep, by = 1)) +
  scale_y_continuous(breaks = seq(0, max_votes_Rep, by = 1)) +
  labs(x = "Number of Clusters",
       y = "Number of Votes",
       title = "Cluster Analysis")
#Using the recommended number of cluster compare the quality of the model
#with 2 clusters
# Both the elbow chart and the NbClust majority vote recommend two clusters.
#Bonus: Create a 3d version of the output
# Lookup table that gives each party label its plotting color.
party_color3D_Rep <- data.frame(party.labels = c("Democrat", "Republican"),
                                color = c("blue", "red"))
# Left commented out like the other View() calls in this report, since View()
# opens an interactive data viewer.
#View(party_color3D_Rep)
# Join the lookup table to our house_votes_Rep data set. The key column is
# named explicitly so the join does not have to guess it.
house_votes_color_Rep <- inner_join(house_votes_Rep, party_color3D_Rep,
                                    by = "party.labels")
## Joining, by = "party.labels"
# Attach the k-means cluster factor as a new column.
# NOTE(review): this assumes the joined table still has one row per member in
# the same order as the data that was clustered - confirm.
house_votes_color_Rep$clusters <- party_clusters_Rep
# Show the structure of the joined table, including the new column.
str(house_votes_color_Rep)
## spec_tbl_df[,7] [427 x 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Last.Name : chr [1:427] "Courtney" "Lewis" "Bera" "McCollum" ...
## $ party.labels: chr [1:427] "Democrat" "Democrat" "Democrat" "Democrat" ...
## $ aye : num [1:427] 66 59 84 74 127 125 125 67 97 68 ...
## $ nay : num [1:427] 163 145 141 154 103 91 95 142 99 147 ...
## $ other : num [1:427] 91 116 95 92 90 104 100 111 124 105 ...
## $ color : chr [1:427] "blue" "blue" "blue" "blue" ...
## $ clusters : Factor w/ 2 levels "1","2": 2 2 2 2 1 1 1 2 1 2 ...
## - attr(*, "spec")=
## .. cols(
## .. Last.Name = col_character(),
## .. party.labels = col_character(),
## .. aye = col_double(),
## .. nay = col_double(),
## .. other = col_double()
## .. )
#Remove special characters
# Drop every character that is not a letter or a digit from the last names.
house_votes_color_Rep$Last.Name <- gsub("[^[:alnum:]]", "", house_votes_color_Rep$Last.Name)
# Use plotly to do a 3d imaging
# Named palette: each party label is tied to its color explicitly instead of
# relying on the alphabetical order of the mapped values.
party_palette_Rep <- c(Democrat = '#0C4B8E', Republican = '#BF382A')
# Axes are the three vote counts; marker symbol encodes the k-means cluster and
# marker color encodes the party label (so traces are named by party rather
# than by the strings "blue" / "red").
fig <- plot_ly(house_votes_color_Rep,
               type = "scatter3d",
               mode = "markers",
               symbol = ~clusters,
               x = ~aye,
               y = ~nay,
               z = ~other,
               color = ~party.labels,
               colors = party_palette_Rep,
               # Hover text: member name and party.
               text = ~paste('Representative:', Last.Name,
                             "Party:", party.labels))
fig
# dev.off()
# The 3D point cloud is nearly flat, so the "other" category isn't impacting the clusters very much.